AIT 526 - Project

Sentiment Analysis of Amazon Online Purchase Ratings

Team 2

     1. Fangxin Zhang
     2. Muhammad Hassan
     3. Shirinithi Thiruppathi
     4. Tewodros Tamene

Dr. Duoduo Liao December 3, 2021

Note:

In this project notebook all the code that was developed for the final outcome is included. This notebook includes:

1. Libraries
2. Importing Data
3. Data Exploration
4. Data Preprocessing
5. Text Preprocessing
6. Exploratory Data Analysis
7. Popularity Based Recommender Analysis
8. Time-Series Analysis
9. TextBlob - Polarity
10. Text Analysis
11. Feature Engineering
12. TF-IDF
13. Sentiment Model Development
14. Model Selection
15. Hyperparameter Tuning
16. Classification Metrics
17. Code References    

1. Libraries

In [1]:
import os
import json
import gzip
import pandas as pd
from urllib.request import urlopen

import random
import numpy as np
from tqdm import tqdm_notebook as tqdm
from collections import defaultdict
In [2]:
#Basic libraries
import pandas as pd 
import numpy as np 


#NLTK libraries
import nltk
import re
import string
#from wordcloud import WordCloud,STOPWORDS
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Machine Learning libraries
import sklearn 
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn import svm, datasets
from sklearn import preprocessing 

#Metrics libraries
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc

#Visualization libraries
import matplotlib.pyplot as plt 
from matplotlib import rcParams
import seaborn as sns
from textblob import TextBlob
from plotly import tools
import plotly.graph_objs as go
from plotly.offline import iplot
%matplotlib inline

#Ignore warnings
import warnings
warnings.filterwarnings('ignore')

#Other miscellaneous libraries
from scipy import interp
from itertools import cycle
#import cufflinks as cf
from collections import defaultdict
from collections import Counter
from imblearn.over_sampling import SMOTE
In [3]:
import cufflinks as cf
import plotly.offline

cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

2. Importing Data

In [4]:
import pandas as pd
import gzip

def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

df = getDF('Software.json.gz')
In [5]:
df.head()
Out[5]:
overall verified reviewTime reviewerID asin style reviewerName reviewText summary unixReviewTime vote image
0 4.0 True 03 11, 2014 A240ORQ2LF9LUI 0077613252 {'Format:': ' Loose Leaf'} Michelle W The materials arrived early and were in excell... Material Great 1394496000 NaN NaN
1 4.0 True 02 23, 2014 A1YCCU0YRLS0FE 0077613252 {'Format:': ' Loose Leaf'} Rosalind White Ames I am really enjoying this book with the worksh... Health 1393113600 NaN NaN
2 1.0 True 02 17, 2014 A1BJHRQDYVAY2J 0077613252 {'Format:': ' Loose Leaf'} Allan R. Baker IF YOU ARE TAKING THIS CLASS DON"T WASTE YOUR ... ARE YOU KIDING ME? 1392595200 7 NaN
3 3.0 True 02 17, 2014 APRDVZ6QBIQXT 0077613252 {'Format:': ' Loose Leaf'} Lucy This book was missing pages!!! Important pages... missing pages!! 1392595200 3 NaN
4 5.0 False 10 14, 2013 A2JZTTBSLS1QXV 0077775473 NaN Albert V. I have used LearnSmart and can officially say ... Best study product out there! 1381708800 NaN NaN
In [6]:
# Rating average

ratings = []

for review in parse("Software.json.gz"):
  ratings.append(review['overall'])

print(sum(ratings) / len(ratings))
3.5701751712969814
In [7]:
# Creating a copy dataframe

process_reviews =df.copy()

3. Data Exploration

In [8]:
# Shape of the data

process_reviews.shape
Out[8]:
(459436, 12)
In [9]:
# Check the datatypes

process_reviews.dtypes
Out[9]:
overall           float64
verified             bool
reviewTime         object
reviewerID         object
asin               object
style              object
reviewerName       object
reviewText         object
summary            object
unixReviewTime      int64
vote               object
image              object
dtype: object
In [10]:
# Five point summary 

process_reviews.describe()['overall'].T
Out[10]:
count    459436.000000
mean          3.570175
std           1.626662
min           1.000000
25%           2.000000
50%           4.000000
75%           5.000000
max           5.000000
Name: overall, dtype: float64
In [11]:
# Find the minimum and maximum ratings

print('Minimum rating is: %d' %(process_reviews.overall.min()))
print('Maximum rating is: %d' %(process_reviews.overall.max()))
Minimum rating is: 1
Maximum rating is: 5

4. Data Preprocessing

Handle missing data

In [12]:
# Checking for null values

process_reviews.isnull().sum()
Out[12]:
overall                0
verified               0
reviewTime             0
reviewerID             0
asin                   0
style             225035
reviewerName          24
reviewText            66
summary               56
unixReviewTime         0
vote              331583
image             457928
dtype: int64
In [13]:
process_reviews['style']=process_reviews['style'].fillna('Missing')
In [14]:
process_reviews['reviewerName']=process_reviews['reviewerName'].fillna('Missing')
In [15]:
process_reviews['vote']=process_reviews['vote'].fillna('Missing')
In [16]:
process_reviews['image']=process_reviews['image'].fillna('Missing')
In [17]:
process_reviews = process_reviews[process_reviews['reviewText'].notna()]
process_reviews = process_reviews[process_reviews['summary'].notna()]
In [18]:
process_reviews.isnull().sum()
Out[18]:
overall           0
verified          0
reviewTime        0
reviewerID        0
asin              0
style             0
reviewerName      0
reviewText        0
summary           0
unixReviewTime    0
vote              0
image             0
dtype: int64

Concatenating Review Text and Summary

In [19]:
process_reviews['reviews']=process_reviews['reviewText']+process_reviews['summary']
process_reviews=process_reviews.drop(['reviewText', 'summary'], axis=1)
process_reviews.head()
Out[19]:
overall verified reviewTime reviewerID asin style reviewerName unixReviewTime vote image reviews
0 4.0 True 03 11, 2014 A240ORQ2LF9LUI 0077613252 {'Format:': ' Loose Leaf'} Michelle W 1394496000 Missing Missing The materials arrived early and were in excell...
1 4.0 True 02 23, 2014 A1YCCU0YRLS0FE 0077613252 {'Format:': ' Loose Leaf'} Rosalind White Ames 1393113600 Missing Missing I am really enjoying this book with the worksh...
2 1.0 True 02 17, 2014 A1BJHRQDYVAY2J 0077613252 {'Format:': ' Loose Leaf'} Allan R. Baker 1392595200 7 Missing IF YOU ARE TAKING THIS CLASS DON"T WASTE YOUR ...
3 3.0 True 02 17, 2014 APRDVZ6QBIQXT 0077613252 {'Format:': ' Loose Leaf'} Lucy 1392595200 3 Missing This book was missing pages!!! Important pages...
4 5.0 False 10 14, 2013 A2JZTTBSLS1QXV 0077775473 Missing Albert V. 1381708800 Missing Missing I have used LearnSmart and can officially say ...

Creating 'sentiment' column

In [20]:
process_reviews['overall'].value_counts()
Out[20]:
5.0    212374
1.0    102528
4.0     73586
3.0     39390
2.0     31442
Name: overall, dtype: int64
In [21]:
def f(row):
    
    '''This function returns sentiment value based on the overall ratings from the user'''
    
    if row['overall'] == 3.0:
        val = 'Neutral'
    elif row['overall'] == 1.0 or row['overall'] == 2.0:
        val = 'Negative'
    elif row['overall'] == 4.0 or row['overall'] == 5.0:
        val = 'Positive'
    else:
        val = -1
    return val
In [22]:
# Applying the function in our new column

process_reviews['sentiment'] = process_reviews.apply(f, axis=1)
process_reviews.head()
Out[22]:
overall verified reviewTime reviewerID asin style reviewerName unixReviewTime vote image reviews sentiment
0 4.0 True 03 11, 2014 A240ORQ2LF9LUI 0077613252 {'Format:': ' Loose Leaf'} Michelle W 1394496000 Missing Missing The materials arrived early and were in excell... Positive
1 4.0 True 02 23, 2014 A1YCCU0YRLS0FE 0077613252 {'Format:': ' Loose Leaf'} Rosalind White Ames 1393113600 Missing Missing I am really enjoying this book with the worksh... Positive
2 1.0 True 02 17, 2014 A1BJHRQDYVAY2J 0077613252 {'Format:': ' Loose Leaf'} Allan R. Baker 1392595200 7 Missing IF YOU ARE TAKING THIS CLASS DON"T WASTE YOUR ... Negative
3 3.0 True 02 17, 2014 APRDVZ6QBIQXT 0077613252 {'Format:': ' Loose Leaf'} Lucy 1392595200 3 Missing This book was missing pages!!! Important pages... Neutral
4 5.0 False 10 14, 2013 A2JZTTBSLS1QXV 0077775473 Missing Albert V. 1381708800 Missing Missing I have used LearnSmart and can officially say ... Positive
In [23]:
# Sentiment count

process_reviews['sentiment'].value_counts()
Out[23]:
Positive    285960
Negative    133970
Neutral      39390
Name: sentiment, dtype: int64

Handling time column

In [24]:
# new data frame which has date and year

new = process_reviews["reviewTime"].str.split(",", n = 1, expand = True) 
  
# making separate date column from new data frame

process_reviews["date"]= new[0] 
  
# making separate year column from new data frame

process_reviews["year"]= new[1] 
process_reviews=process_reviews.drop(['reviewTime'], axis=1)
process_reviews.head()
Out[24]:
overall verified reviewerID asin style reviewerName unixReviewTime vote image reviews sentiment date year
0 4.0 True A240ORQ2LF9LUI 0077613252 {'Format:': ' Loose Leaf'} Michelle W 1394496000 Missing Missing The materials arrived early and were in excell... Positive 03 11 2014
1 4.0 True A1YCCU0YRLS0FE 0077613252 {'Format:': ' Loose Leaf'} Rosalind White Ames 1393113600 Missing Missing I am really enjoying this book with the worksh... Positive 02 23 2014
2 1.0 True A1BJHRQDYVAY2J 0077613252 {'Format:': ' Loose Leaf'} Allan R. Baker 1392595200 7 Missing IF YOU ARE TAKING THIS CLASS DON"T WASTE YOUR ... Negative 02 17 2014
3 3.0 True APRDVZ6QBIQXT 0077613252 {'Format:': ' Loose Leaf'} Lucy 1392595200 3 Missing This book was missing pages!!! Important pages... Neutral 02 17 2014
4 5.0 False A2JZTTBSLS1QXV 0077775473 Missing Albert V. 1381708800 Missing Missing I have used LearnSmart and can officially say ... Positive 10 14 2013
In [25]:
# Splitting the date 

new1 = process_reviews["date"].str.split(" ", n = 1, expand = True) 
  
# adding month to the main dataset 

process_reviews["month"]= new1[0] 
  
# adding day to the main dataset 

process_reviews["day"]= new1[1] 
process_reviews=process_reviews.drop(['date'], axis=1)
process_reviews.head()
Out[25]:
overall verified reviewerID asin style reviewerName unixReviewTime vote image reviews sentiment year month day
0 4.0 True A240ORQ2LF9LUI 0077613252 {'Format:': ' Loose Leaf'} Michelle W 1394496000 Missing Missing The materials arrived early and were in excell... Positive 2014 03 11
1 4.0 True A1YCCU0YRLS0FE 0077613252 {'Format:': ' Loose Leaf'} Rosalind White Ames 1393113600 Missing Missing I am really enjoying this book with the worksh... Positive 2014 02 23
2 1.0 True A1BJHRQDYVAY2J 0077613252 {'Format:': ' Loose Leaf'} Allan R. Baker 1392595200 7 Missing IF YOU ARE TAKING THIS CLASS DON"T WASTE YOUR ... Negative 2014 02 17
3 3.0 True APRDVZ6QBIQXT 0077613252 {'Format:': ' Loose Leaf'} Lucy 1392595200 3 Missing This book was missing pages!!! Important pages... Neutral 2014 02 17
4 5.0 False A2JZTTBSLS1QXV 0077775473 Missing Albert V. 1381708800 Missing Missing I have used LearnSmart and can officially say ... Positive 2013 10 14

5. Text Preprocessing

Review Text - Punctuation Cleaning

In [28]:
#Removing unnecessary columns

process_reviews=process_reviews.drop(['reviewerName','unixReviewTime'], axis=1)

#Creating a copy 

clean_reviews=process_reviews.copy()
In [29]:
def review_cleaning(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text
In [30]:
process_reviews['reviews'] = process_reviews['reviews'].apply(lambda x:review_cleaning(x))
process_reviews.head()
Out[30]:
overall verified reviewerID asin style vote image reviews sentiment year month day
0 4.0 True A240ORQ2LF9LUI 0077613252 {'Format:': ' Loose Leaf'} Missing Missing the materials arrived early and were in excell... Positive 2014 03 11
1 4.0 True A1YCCU0YRLS0FE 0077613252 {'Format:': ' Loose Leaf'} Missing Missing i am really enjoying this book with the worksh... Positive 2014 02 23
2 1.0 True A1BJHRQDYVAY2J 0077613252 {'Format:': ' Loose Leaf'} 7 Missing if you are taking this class dont waste your m... Negative 2014 02 17
3 3.0 True APRDVZ6QBIQXT 0077613252 {'Format:': ' Loose Leaf'} 3 Missing this book was missing pages important pages i ... Neutral 2014 02 17
4 5.0 False A2JZTTBSLS1QXV 0077775473 Missing Missing Missing i have used learnsmart and can officially say ... Positive 2013 10 14

Review text-Stop words

In [31]:
stop_words= ['yourselves', 'between', 'whom', 'itself', 'is', "she's", 'up', 'herself', 'here', 'your', 'each', 
             'we', 'he', 'my', "you've", 'having', 'in', 'both', 'for', 'themselves', 'are', 'them', 'other',
             'and', 'an', 'during', 'their', 'can', 'yourself', 'she', 'until', 'so', 'these', 'ours', 'above', 
             'what', 'while', 'have', 're', 'more', 'only', "needn't", 'when', 'just', 'that', 'were', "don't", 
             'very', 'should', 'any', 'y', 'isn', 'who',  'a', 'they', 'to', 'too', "should've", 'has', 'before',
             'into', 'yours', "it's", 'do', 'against', 'on',  'now', 'her', 've', 'd', 'by', 'am', 'from', 'about', 'further', "that'll", "you'd", 'you', 'as', 'how', 'been', 'the', 'or', 'doing', 'such',
             'his', 'himself', 'ourselves',  'was', 'through', 'out', 'below', 'own', 'myself', 'theirs', 
             'me', 'why', 'once',  'him', 'than', 'be', 'most', "you'll", 'same', 'some', 'with', 'few', 'it',
             'at', 'after', 'its', 'which', 'there','our', 'this', 'hers', 'being', 'did', 'of', 'had', 'under',
             'over','again', 'where', 'those', 'then', "you're", 'i', 'because', 'does', 'all']
In [32]:
process_reviews['reviews'] = process_reviews['reviews'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
process_reviews.head()
Out[32]:
overall verified reviewerID asin style vote image reviews sentiment year month day
0 4.0 True A240ORQ2LF9LUI 0077613252 {'Format:': ' Loose Leaf'} Missing Missing materials arrived early excellent condition ho... Positive 2014 03 11
1 4.0 True A1YCCU0YRLS0FE 0077613252 {'Format:': ' Loose Leaf'} Missing Missing really enjoying book worksheets make review go... Positive 2014 02 23
2 1.0 True A1BJHRQDYVAY2J 0077613252 {'Format:': ' Loose Leaf'} 7 Missing if taking class dont waste money called book b... Negative 2014 02 17
3 3.0 True APRDVZ6QBIQXT 0077613252 {'Format:': ' Loose Leaf'} 3 Missing book missing pages important pages couldnt ans... Neutral 2014 02 17
4 5.0 False A2JZTTBSLS1QXV 0077775473 Missing Missing Missing used learnsmart officially say amazing study t... Positive 2013 10 14

6. Exploratory Data Analysis

In [33]:
# Unique customers and products

print("Total data ")
print("-"*50)
print("\nTotal No of Ratings  :",process_reviews.shape[0])
print("Total No of Reviwers :", len(np.unique(process_reviews.reviewerID)))
print("Total No of Products :", len(np.unique(process_reviews.asin)))
Total data 
--------------------------------------------------

Total No of Ratings  : 459320
Total No of Reviwers : 375047
Total No of Products : 21659
In [34]:
# Analysis of rating given by the customer 
# Top five customers who have given the most reviews

no_of_rated_products_per_user = process_reviews.groupby(by='reviewerID')['overall'].count().sort_values(ascending=False)

no_of_rated_products_per_user.head()
Out[34]:
reviewerID
A5JLAU2ARJ0BO     73
A680RUE1FDO8B     71
A225G2TFM76GYX    69
A3W4D8XOGLWUN5    68
A15S4XW3CRISZ5    65
Name: overall, dtype: int64

Distribution of overall Reviews

In [35]:
f, ax = plt.subplots(figsize=(8,5))
ax = process_reviews.overall.value_counts().plot(kind="bar", color = "blue")
ax.set_title("Frequency Distribution of Ratings")
ax.set_xticklabels(df.overall.value_counts().index, rotation = 30)
plt.show()

Distribution of the Target Variable

In [36]:
f, ax = plt.subplots(figsize=(8, 6))
ax = sns.countplot(y="sentiment", data=process_reviews, palette="Set1")
ax.set_title("Frequency Distribution of Sentiment Variable")
plt.show()

7. Popularity Based Recommender Analysis

In [37]:
#Getting the new dataframe which contains users who has given 50 or more ratings

new_df = process_reviews.groupby("asin").filter(lambda x:x['overall'].count() >=50)
In [38]:
# Average rating of the product 

new_df.groupby('asin')['overall'].mean().head()
Out[38]:
asin
0615179088    4.627119
0763855553    3.942688
0842340351    4.523077
1413313701    3.908163
1426296355    3.976562
Name: overall, dtype: float64
In [39]:
# Average rating per product decreasing order

new_df.groupby('asin')['overall'].mean().sort_values(ascending=False).head()
Out[39]:
asin
B000050ZRE    4.937705
B00SX73LIK    4.930769
B000EORV8Q    4.928793
B0001FS9NE    4.915033
B0000AZJY6    4.897569
Name: overall, dtype: float64
In [40]:
# Total no of rating for product

new_df.groupby('asin')['overall'].count().sort_values(ascending=False).head()
Out[40]:
asin
B00UB76290    8989
B00CTTEKJW    7932
B00NG7JVSQ    6394
B00H9A60O4    4729
B00E6LJ2SA    4048
Name: overall, dtype: int64
In [41]:
# Top 30 popular software products

popular_products = pd.DataFrame(new_df.groupby('asin')['overall'].count())
most_popular = popular_products.sort_values('overall', ascending=False)
most_popular.head(30).plot(kind = "bar")
plt.title('Top 30 popular software products')
plt.xlabel('Product Asin/Product Number')
plt.ylabel('Product Count')
plt.show()

8. Time Series Analysis

Year vs Sentiment count

In [42]:
process_reviews.groupby(['year','sentiment'])['sentiment'].count().unstack().plot(legend=True)
plt.title('Year and Sentiment count')
plt.xlabel('Year')
plt.ylabel('Sentiment count')
plt.show()

Month vs Sentiment count

In [43]:
process_reviews.groupby(['month','sentiment'])['sentiment'].count().unstack().plot(legend=True)
plt.title('Month and Sentiment count')
plt.xlabel('Month')
plt.ylabel('Sentiment count')
plt.show()

Day of Month vs Reviews count

In [44]:
#Creating a dataframe

day=pd.DataFrame(process_reviews.groupby('day')['reviews'].count()).reset_index()
day['day']=day['day'].astype('int64')
day.sort_values(by=['day'])

#Plotting the graph

sns.barplot(x="day", y="reviews", data=day)
plt.title('Day vs Reviews count')
plt.xlabel('Day')
plt.ylabel('Reviews count')
plt.show()

Month vs Reviews count

In [45]:
#Creating a dataframe

month=pd.DataFrame(process_reviews.groupby('month')['reviews'].count()).reset_index()
month['month']=month['month'].astype('int64')
month.sort_values(by=['month'])

#Plotting the graph

sns.barplot(x="month", y="reviews", data=month)
plt.title('Month vs Reviews count')
plt.xlabel('Month')
plt.ylabel('Reviews count')
plt.show()

Year vs Reviews count

In [46]:
#Creating a dataframe

year=pd.DataFrame(process_reviews.groupby('year')['reviews'].count()).reset_index()
year['year']=year['year'].astype('int64')
year.sort_values(by=['year'])

#Plotting the graph
f, ax = plt.subplots(figsize=(15, 6))
sns.barplot(x="year", y="reviews", data=year)
plt.title('Year vs Reviews count')
plt.xlabel('Year')
plt.ylabel('Reviews count'),
plt.show()

9. TextBlob Polarity - Sentiment Classifier

In [47]:
process_reviews['polarity'] = process_reviews['reviews'].map(lambda text: TextBlob(text).sentiment.polarity)

Review Length and Word Count

In [48]:
process_reviews['review_len'] = process_reviews['reviews'].astype(str).apply(len)
process_reviews['word_count'] = process_reviews['reviews'].apply(lambda x: len(str(x).split()))
In [49]:
process_reviews.head()
Out[49]:
overall verified reviewerID asin style vote image reviews sentiment year month day polarity review_len word_count
0 4.0 True A240ORQ2LF9LUI 0077613252 {'Format:': ' Loose Leaf'} Missing Missing materials arrived early excellent condition ho... Positive 2014 03 11 0.339744 120 16
1 4.0 True A1YCCU0YRLS0FE 0077613252 {'Format:': ' Loose Leaf'} Missing Missing really enjoying book worksheets make review go... Positive 2014 02 23 0.250000 98 13
2 1.0 True A1BJHRQDYVAY2J 0077613252 {'Format:': ' Loose Leaf'} 7 Missing if taking class dont waste money called book b... Negative 2014 02 17 -0.094231 176 29
3 3.0 True APRDVZ6QBIQXT 0077613252 {'Format:': ' Loose Leaf'} 3 Missing book missing pages important pages couldnt ans... Neutral 2014 02 17 0.100000 97 13
4 5.0 False A2JZTTBSLS1QXV 0077775473 Missing Missing Missing used learnsmart officially say amazing study t... Positive 2013 10 14 0.333333 394 54

Sentiment polarity distribution

In [50]:
process_reviews['polarity'].iplot(
    kind='hist',
    bins=50,
    xTitle='polarity',
    linecolor='black',
    yTitle='count',
    title='Sentiment Polarity Distribution')

Review Rating Distribution

In [51]:
process_reviews['overall'].iplot(
    kind='hist',
    xTitle='rating',
    linecolor='black',
    yTitle='count',
    title='Review Rating Distribution')

Review Text Length Distribution

In [52]:
process_reviews['review_len'].iplot(
    kind='hist',
    bins=100,
    xTitle='review length',
    linecolor='black',
    yTitle='count',
    title='Review Text Length Distribution')

Review Text Word Count Distribution

In [53]:
process_reviews['word_count'].iplot(
    kind='hist',
    bins=100,
    xTitle='word count',
    linecolor='black',
    yTitle='count',
    title='Review Text Word Count Distribution')

10. Text Analysis

N-gram analysis

Monogram analysis

In [54]:
from nltk.corpus import stopwords
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
In [55]:
# Filtering data

review_pos = process_reviews[process_reviews["sentiment"]=='Positive'].dropna()
review_neu = process_reviews[process_reviews["sentiment"]=='Neutral'].dropna()
review_neg = process_reviews[process_reviews["sentiment"]=='Negative'].dropna()


# custom function for ngram generation 

def generate_ngrams(text, n_gram=1):
    token = [token for token in text.lower().split(" ") if token != "" if token not in stopwords.words('english')]
    ngrams = zip(*[token[i:] for i in range(n_gram)])
    return [" ".join(ngram) for ngram in ngrams]
In [56]:
# custom function for horizontal bar chart

def horizontal_bar_chart(df, color):
    trace = go.Bar(
        y=df["word"].values[::-1],
        x=df["wordcount"].values[::-1],
        showlegend=False,
        orientation = 'h',
        marker=dict(
            color=color,
        ),
    )
    return trace
In [57]:
# Get the bar chart from positive reviews 

freq_dict = defaultdict(int)
for sent in review_pos["reviews"]:
    for word in generate_ngrams(sent):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace0 = horizontal_bar_chart(fd_sorted.head(25), 'green')
In [58]:
# Get the bar chart from neutral reviews 
freq_dict = defaultdict(int)
for sent in review_neu["reviews"]:
    for word in generate_ngrams(sent):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace1 = horizontal_bar_chart(fd_sorted.head(25), 'grey')

# Get the bar chart from negative reviews 
freq_dict = defaultdict(int)
for sent in review_neg["reviews"]:
    for word in generate_ngrams(sent):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace2 = horizontal_bar_chart(fd_sorted.head(25), 'red')

# Creating two subplots
fig = tools.make_subplots(rows=3, cols=1, vertical_spacing=0.04,
                          subplot_titles=["Frequent words of positive reviews", "Frequent words of neutral reviews",
                                          "Frequent words of negative reviews"])
fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 2, 1)
fig.append_trace(trace2, 3, 1)
fig['layout'].update(height=1200, width=900, paper_bgcolor='rgb(233,233,233)', title="Word Count Plots")
iplot(fig, filename='word-plots')

Bigram Analysis

In [59]:
# Get the bar chart from positive reviews

freq_dict = defaultdict(int)
for sent in review_pos["reviews"]:
    for word in generate_ngrams(sent,2):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace0 = horizontal_bar_chart(fd_sorted.head(25), 'green')
In [60]:
# Get the bar chart from neutral reviews

freq_dict = defaultdict(int)
for sent in review_neu["reviews"]:
    for word in generate_ngrams(sent,2):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace1 = horizontal_bar_chart(fd_sorted.head(25), 'grey')
In [61]:
# Get the bar chart from negative reviews

freq_dict = defaultdict(int)
for sent in review_neg["reviews"]:
    for word in generate_ngrams(sent,2):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace2 = horizontal_bar_chart(fd_sorted.head(25), 'brown')
In [62]:
# Creating two subplots

fig = tools.make_subplots(rows=3, cols=1, vertical_spacing=0.04,horizontal_spacing=0.25,
                          subplot_titles=["Bigram plots of Positive reviews", 
                                          "Bigram plots of Neutral reviews",
                                          "Bigram plots of Negative reviews"
                                          ])
fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 2, 1)
fig.append_trace(trace2, 3, 1)


fig['layout'].update(height=1000, width=800, paper_bgcolor='rgb(233,233,233)', title="Bigram Plots")
iplot(fig, filename='word-plots')

Trigram analysis

In [63]:
# Get the bar chart from positive reviews

freq_dict = defaultdict(int)
for sent in review_pos["reviews"]:
    for word in generate_ngrams(sent,3):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace0 = horizontal_bar_chart(fd_sorted.head(25), 'green')

# Get the bar chart from neutral reviews 

freq_dict = defaultdict(int)
for sent in review_neu["reviews"]:
    for word in generate_ngrams(sent,3):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace1 = horizontal_bar_chart(fd_sorted.head(25), 'grey')

# Get the bar chart from negative reviews 

freq_dict = defaultdict(int)
for sent in review_neg["reviews"]:
    for word in generate_ngrams(sent,3):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace2 = horizontal_bar_chart(fd_sorted.head(25), 'red')

# Creating two subplots

fig = tools.make_subplots(rows=3, cols=1, vertical_spacing=0.04, horizontal_spacing=0.05,
                          subplot_titles=["Tri-gram plots of Positive reviews", 
                                          "Tri-gram plots of Neutral reviews",
                                          "Tri-gram plots of Negative reviews"])
fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 2, 1)
fig.append_trace(trace2, 3, 1)
fig['layout'].update(height=1200, width=1200, paper_bgcolor='rgb(233,233,233)', title="Trigram Count Plots")
iplot(fig, filename='word-plots')

Wordcloud

Wordcloud-Positive reviews

In [52]:
import os
import nltk
from os import path
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import os
from os import path
from wordcloud import WordCloud
from wordcloud import WordCloud, STOPWORDS
In [53]:
text = review_pos["reviews"]
wordcloud = WordCloud(
    width = 3000,
    height = 2000,
    background_color = 'black',
    stopwords = STOPWORDS).generate(str(text))
fig = plt.figure(
    figsize = (40, 30),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

Wordcloud-Neutral reviews

In [54]:
text = review_neu["reviews"]
wordcloud = WordCloud(
    width = 3000,
    height = 2000,
    background_color = 'black',
    stopwords = STOPWORDS).generate(str(text))
fig = plt.figure(
    figsize = (40, 30),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

Wordcloud-Negative reviews

In [55]:
text = review_neg["reviews"]
wordcloud = WordCloud(
    width = 3000,
    height = 2000,
    background_color = 'black',
    stopwords = stop_words).generate(str(text))
fig = plt.figure(
    figsize = (40, 30),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

11. Feature Engineering

Encoding Target Variable - Sentiment

In [64]:
# calling the label encoder function

label_encoder = preprocessing.LabelEncoder() 
  
# Encode labels in column 'sentiment'

process_reviews['sentiment']= label_encoder.fit_transform(process_reviews['sentiment']) 
  
process_reviews['sentiment'].unique()
Out[64]:
array([2, 0, 1])
In [65]:
process_reviews['sentiment'].value_counts()
Out[65]:
2    285960
0    133970
1     39390
Name: sentiment, dtype: int64
In [50]:
f, ax = plt.subplots(figsize=(8, 6))
ax = sns.countplot(y="sentiment", data=process_reviews, palette="Set1")
ax.set_title("Frequency Distribution of Sentiment")
plt.show()

Stemming the reviews

In [66]:
# Extracting 'reviews' for processing

review_features=process_reviews.copy()
review_features=review_features[['reviews']].reset_index(drop=True)
review_features.head()
Out[66]:
reviews
0 materials arrived early excellent condition ho...
1 really enjoying book worksheets make review go...
2 if taking class dont waste money called book b...
3 book missing pages important pages couldnt ans...
4 used learnsmart officially say amazing study t...
In [67]:
# Performing stemming on the review dataframe

ps = PorterStemmer()

# splitting and adding the stemmed words except stopwords

corpus = []
for i in range(0, len(review_features)):
    review = re.sub('[^a-zA-Z]', ' ', review_features['reviews'][i])
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stop_words]
    review = ' '.join(review)
    corpus.append(review)    
In [68]:
corpus[3]
Out[68]:
'book miss page import page couldnt answer test question never happen beforemiss page'

12. TFIDF (Term Frequency — Inverse Document Frequency)

In [69]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(2,2))

# TF-IDF feature matrix

X= tfidf_vectorizer.fit_transform(review_features['reviews'])
In [70]:
X.shape
Out[70]:
(459320, 5000)
In [72]:
# Getting the target variable(encoded)

y=process_reviews['sentiment']

13. Sentiment Model Development

Handling Imbalance target feature with SMOTE

In [73]:
from scipy import interp
from itertools import cycle
import cufflinks as cf
from collections import defaultdict
from collections import Counter
from imblearn.over_sampling import SMOTE
In [74]:
print(f'Original dataset shape : {Counter(y)}')

smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

print(f'Resampled dataset shape {Counter(y_res)}')
Original dataset shape : Counter({2: 285960, 0: 133970, 1: 39390})
Resampled dataset shape Counter({2: 285960, 0: 285960, 1: 285960})

Train-Test Split (75:25)

In [77]:
# Divide the dataset into Train and Test

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.25, random_state=0)
In [78]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    thresh = cm.max() / 2.
    for i in range (cm.shape[0]):
        for j in range (cm.shape[1]):
            plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
            
            
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

14. Model Selection

In [79]:
# creating the objects

logreg_cv = LogisticRegression(random_state=0)
dt_cv=DecisionTreeClassifier()
knn_cv=KNeighborsClassifier()
cv_dict = {0: 'Logistic Regression', 1: 'Decision Tree',2:'KNN'}
cv_models=[logreg_cv,dt_cv,knn_cv]

for i,model in enumerate(cv_models):
    print("{} Test Accuracy: {}".format(cv_dict[i],cross_val_score(model, X, y, cv=10, scoring ='accuracy').mean()))
Logistic Regression Test Accuracy: 0.7985413219541933
Decision Tree Test Accuracy: 0.723811286249238
KNN Test Accuracy: 0.6699403465993208

15. Hyperparameter Tuning

In [80]:
# Hyperparameter tuning for Logistic Regression

param_grid = {'C': np.logspace(-4, 4, 50),
             'penalty':['l1', 'l2']}
clf = GridSearchCV(LogisticRegression(random_state=0), param_grid,cv=5, verbose=0,n_jobs=-1)
best_model = clf.fit(X_train,y_train)
print(best_model.best_estimator_)
print("The mean accuracy of the model is:",best_model.score(X_test,y_test))
LogisticRegression(C=2.559547922699533, random_state=0)
The mean accuracy of the model is: 0.6863990301673893

Parameters and Running Final Model

In [81]:
logreg = LogisticRegression(C=2.559, random_state=0)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
Accuracy of logistic regression classifier on test set: 0.69

16. Classification Metrics

Confusion Matrix

In [82]:
cm = metrics.confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, classes=['Negative','Neutral','Positive'])
Confusion matrix, without normalization

Classification Report

In [83]:
print("Classification Report:\n",classification_report(y_test, y_pred))
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.66      0.70     71687
           1       0.58      0.71      0.64     71272
           2       0.79      0.69      0.74     71511

    accuracy                           0.69    214470
   macro avg       0.70      0.69      0.69    214470
weighted avg       0.70      0.69      0.69    214470

ROC-AUC curve

In [84]:
# Binarizing the target feature

y = label_binarize(y, classes=[0, 1, 2])
n_classes = y.shape[1]

# Train-Test split(80:20)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2,
                                                    random_state=0)

# OneVsRestClassifier

classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True,
                                 random_state=10))
y_score = classifier.fit(X_train, y_train).decision_function(X_test)

# Computing TPR and FPR

fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area

fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# aggregate all false positive rates

all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

# interpolate all ROC curves at this points

mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += interp(all_fpr, fpr[i], tpr[i])

# Finally average it and compute AUC

mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot all ROC curves

plt.figure()
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)

plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)


colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=4,
             label='ROC curve of class {0} (area = {1:0.2f})'
             ''.format(i, roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--', lw=4)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
plt.show()

17. Code References

1. Dr. Liao's code examples & tutorials, code snippets & hints
2. Li, S. (2018).A Complete Exploratory Data Analysis and Visualization for Text Data. Retrived from                                https://towardsdatascience.com/a-complete-exploratory-data-analysis-and-visualization-for-text-data 29fb1b96fb6a
3. Rohith, R. (2018). Sentiment Extraction:Understanding metric+EDA. 
   Retrived from https://www.kaggle.com/ratan123/sentiment-extraction-understanding-metric-eda
4. Scikit Learn. (n.d.). Retrived from https://scikit-learn.org/stable/auto_examples/
5. Ni, J. (2018). Amazon Review Data. https://nijianmo.github.io/amazon/